import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier,plot_importance
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
import numpy as np
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from collections import Counter
import plotly.io as pio
pio.renderers.default='notebook'
Data Exploration¶
df = pd.read_csv("term-deposit-marketing-2020.csv")
print(df)
age job marital education default balance housing loan \
0 58 management married tertiary no 2143 yes no
1 44 technician single secondary no 29 yes no
2 33 entrepreneur married secondary no 2 yes yes
3 47 blue-collar married unknown no 1506 yes no
4 33 unknown single unknown no 1 no no
... ... ... ... ... ... ... ... ...
39995 53 technician married tertiary no 395 no no
39996 30 management single tertiary no 3340 no no
39997 54 admin divorced secondary no 200 no no
39998 34 management married tertiary no 1047 no no
39999 38 technician married secondary no 1442 yes no
contact day month duration campaign y
0 unknown 5 may 261 1 no
1 unknown 5 may 151 1 no
2 unknown 5 may 76 1 no
3 unknown 5 may 92 1 no
4 unknown 5 may 198 1 no
... ... ... ... ... ... ...
39995 cellular 3 jun 107 1 no
39996 cellular 3 jun 238 3 yes
39997 cellular 3 jun 170 1 yes
39998 cellular 3 jun 342 1 no
39999 cellular 3 jun 113 1 no
[40000 rows x 14 columns]
df.head()
| age | job | marital | education | default | balance | housing | loan | contact | day | month | duration | campaign | y | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 58 | management | married | tertiary | no | 2143 | yes | no | unknown | 5 | may | 261 | 1 | no |
| 1 | 44 | technician | single | secondary | no | 29 | yes | no | unknown | 5 | may | 151 | 1 | no |
| 2 | 33 | entrepreneur | married | secondary | no | 2 | yes | yes | unknown | 5 | may | 76 | 1 | no |
| 3 | 47 | blue-collar | married | unknown | no | 1506 | yes | no | unknown | 5 | may | 92 | 1 | no |
| 4 | 33 | unknown | single | unknown | no | 1 | no | no | unknown | 5 | may | 198 | 1 | no |
df.tail()
| age | job | marital | education | default | balance | housing | loan | contact | day | month | duration | campaign | y | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 39995 | 53 | technician | married | tertiary | no | 395 | no | no | cellular | 3 | jun | 107 | 1 | no |
| 39996 | 30 | management | single | tertiary | no | 3340 | no | no | cellular | 3 | jun | 238 | 3 | yes |
| 39997 | 54 | admin | divorced | secondary | no | 200 | no | no | cellular | 3 | jun | 170 | 1 | yes |
| 39998 | 34 | management | married | tertiary | no | 1047 | no | no | cellular | 3 | jun | 342 | 1 | no |
| 39999 | 38 | technician | married | secondary | no | 1442 | yes | no | cellular | 3 | jun | 113 | 1 | no |
df.describe()
| age | balance | day | duration | campaign | |
|---|---|---|---|---|---|
| count | 40000.000000 | 40000.000000 | 40000.000000 | 40000.000000 | 40000.000000 |
| mean | 40.544600 | 1274.277550 | 16.017225 | 254.824300 | 2.882175 |
| std | 9.641776 | 2903.769716 | 8.278127 | 259.366498 | 3.239051 |
| min | 19.000000 | -8019.000000 | 1.000000 | 0.000000 | 1.000000 |
| 25% | 33.000000 | 54.000000 | 8.000000 | 100.000000 | 1.000000 |
| 50% | 39.000000 | 407.000000 | 17.000000 | 175.000000 | 2.000000 |
| 75% | 48.000000 | 1319.000000 | 21.000000 | 313.000000 | 3.000000 |
| max | 95.000000 | 102127.000000 | 31.000000 | 4918.000000 | 63.000000 |
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 40000 entries, 0 to 39999 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 age 40000 non-null int64 1 job 40000 non-null object 2 marital 40000 non-null object 3 education 40000 non-null object 4 default 40000 non-null object 5 balance 40000 non-null int64 6 housing 40000 non-null object 7 loan 40000 non-null object 8 contact 40000 non-null object 9 day 40000 non-null int64 10 month 40000 non-null object 11 duration 40000 non-null int64 12 campaign 40000 non-null int64 13 y 40000 non-null object dtypes: int64(5), object(9) memory usage: 4.3+ MB
Data cleaning¶
1- Check for missing values
df.isnull().sum()
age 0 job 0 marital 0 education 0 default 0 balance 0 housing 0 loan 0 contact 0 day 0 month 0 duration 0 campaign 0 y 0 dtype: int64
No missing values are present
2- Checking outliers
fig = px.histogram(df[df.age<70], x="age")
fig.show()
We can see that the majority of people have age between 30 and 40. After 60 only few samples are present
fig = px.histogram(df[(-3000<= df['balance']) & (df['balance']<=10000)], x="balance",nbins=10)
fig
The majority of the people have the balance in the range between 0 and 2k with very few samples after 6k.
fig = px.histogram(df['day'], x="day")
fig
counts, bins = np.histogram(df.duration, bins=range(0, 1000, 30))
bins = 0.5 * (bins[:-1] + bins[1:])
fig = px.bar(x=bins, y=counts, labels={'x':'duration', 'y':'count'})
fig
The duration of the last contact seems to be in the range of 500 seconds for most people, the number of people decrease constantly after that
counts, bins = np.histogram(df.campaign, bins=range(0, 21, 6))
bins = 0.5 * (bins[:-1] + bins[1:])
fig = px.bar(x=bins, y=counts, labels={'x':'campaign', 'y':'count'})
fig
The number of contacts performed during this campaign seems to be approximately between one and 1 and 3 for most client rarely exceeding 13.
Visualize data and features engineering¶
subscriber_counts = df['y'].value_counts()
fig = go.Figure(data=[go.Pie(labels=subscriber_counts.keys().tolist(), values=subscriber_counts)])
fig
df['y']=pd.factorize(df['y'])[0]
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 40000 entries, 0 to 39999 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 age 40000 non-null int64 1 job 40000 non-null object 2 marital 40000 non-null object 3 education 40000 non-null object 4 default 40000 non-null object 5 balance 40000 non-null int64 6 housing 40000 non-null object 7 loan 40000 non-null object 8 contact 40000 non-null object 9 day 40000 non-null int64 10 month 40000 non-null object 11 duration 40000 non-null int64 12 campaign 40000 non-null int64 13 y 40000 non-null int64 dtypes: int64(6), object(8) memory usage: 4.3+ MB
df_sub = df[df.y==1]
df_sub.shape
(2896, 14)
df_not_sub = df[df.y==0]
df_not_sub.shape
(37104, 14)
The data are unbalanced with only 2896 subscriber aganist 37104 non subscriber. We need to remember this in the classification task.
Age¶
label = ['18-20','20-25', '25-30', '30-35','35-40','40-45','45-50','50-55','55-60','60-65','65-70','>70']
num_samp_age = [(df[(18 <= df['age']) & (df['age'] <= 20)]).shape[0],(df[(20 < df['age']) & (df['age'] <= 25)]).shape[0],(df[(25 < df['age']) & (df['age'] <= 30)]).shape[0],(df[(30 < df['age']) & (df['age'] <= 35)]).shape[0],(df[(35 < df['age']) & (df['age'] <= 40)]).shape[0],(df[(40 < df['age']) & (df['age'] <= 45)]).shape[0],(df[(45 < df['age']) & (df['age'] <= 50)]).shape[0],(df[(50 < df['age']) & (df['age'] <= 55)]).shape[0],(df[(55 < df['age']) & (df['age'] <= 60)]).shape[0],(df[(60 < df['age']) & (df['age'] <= 65)]).shape[0],(df[(65 < df['age']) & (df['age'] <= 70)]).shape[0],(df[(df['age'] > 70)]).shape[0]]
perc_age = [round((df[(18 <= df['age']) & (df['age'] <= 20)& (df['y'] == 1)]).shape[0]/num_samp_age[0]*100,2),round((df[(20 < df['age']) & (df['age'] <= 25)& (df['y'] == 1)]).shape[0]/num_samp_age[1]*100,2),round((df[(25 < df['age']) & (df['age'] <= 30)& (df['y'] == 1)]).shape[0]/num_samp_age[2]*100,2),round((df[(30 < df['age']) & (df['age'] <= 35)& (df['y'] == 1)]).shape[0]/num_samp_age[3]*100,2),round((df[(35 < df['age']) & (df['age'] <= 40)& (df['y'] == 1)]).shape[0]/num_samp_age[4]*100,2),round((df[(40 < df['age']) & (df['age'] <= 45)& (df['y'] == 1)]).shape[0]/num_samp_age[5]*100,2),round((df[(45 < df['age']) & (df['age'] <= 50)& (df['y'] == 1)]).shape[0]/num_samp_age[6]*100,2),round((df[(50 < df['age']) & (df['age'] <= 55)& (df['y'] == 1)]).shape[0]/num_samp_age[7]*100,2),round((df[(55 < df['age']) & (df['age'] <= 60)& (df['y'] == 1)]).shape[0]/num_samp_age[8]*100,2),round((df[(60 < df['age']) & (df['age'] <= 65)& (df['y'] == 1)]).shape[0]/num_samp_age[9]*100,2),round((df[(65 < df['age']) & (df['age'] <= 70)& (df['y'] == 1)]).shape[0]/num_samp_age[10]*100,2),round((df[(df['age'] > 70)& (df['y'] == 1)]).shape[0]/num_samp_age[11]*100,2)]
# Create Plot
fig = make_subplots(specs=[[{"secondary_y": True}]])
# Add traces
fig.add_trace(
go.Scatter(x=label, y=perc_age, name="Subscriber percentage"),
secondary_y=False,
)
fig.add_trace(
go.Bar(x=label, y=num_samp_age, name="Number of samples",width=0.1),
secondary_y=True,
)
# Add figure title
fig.update_layout(
title_text="Age feature"
)
# Set x-axis title
fig.update_xaxes(title_text="Age range")
# Set y-axes titles
fig.update_yaxes(title_text="Subscriber percentage", secondary_y=False)
fig.update_yaxes(title_text="Number of samples", secondary_y=True)
fig
The subscriber percentage increase after 60 years old
label = ['18-30','30-40', '40-50', '50-60','>60']
num_samp_age = [(df[(18 <= df['age']) & (df['age'] <= 30)]).shape[0],(df[(30 < df['age']) & (df['age'] <= 40)]).shape[0],(df[(40 < df['age']) & (df['age'] <= 50)]).shape[0],(df[(50 < df['age']) & (df['age'] <= 60)]).shape[0],(df[(df['age'] > 60)]).shape[0]]
perc_age = [round((df[(18 <= df['age']) & (df['age'] <= 30)& (df['y'] == 1)]).shape[0]/num_samp_age[0]*100,2),round((df[(30 < df['age']) & (df['age'] <= 40)& (df['y'] == 1)]).shape[0]/num_samp_age[1]*100,2),round((df[(40 < df['age']) & (df['age'] <= 50)& (df['y'] == 1)]).shape[0]/num_samp_age[2]*100,2),round((df[(50 < df['age']) & (df['age'] <= 60)& (df['y'] == 1)]).shape[0]/num_samp_age[3]*100,2),round((df[(df['age'] > 60)& (df['y'] == 1)]).shape[0]/num_samp_age[4]*100,2)]
# Create Plot
fig = make_subplots(specs=[[{"secondary_y": True}]])
# Add traces
fig.add_trace(
go.Scatter(x=label, y=perc_age, name="Subscriber percentage"),
secondary_y=False,
)
fig.add_trace(
go.Bar(x=label, y=num_samp_age, name="Number of samples",width=0.1),
secondary_y=True,
)
# Add figure title
fig.update_layout(
title_text="Age feature"
)
# Set x-axis title
fig.update_xaxes(title_text="Age range")
# Set y-axes titles
fig.update_yaxes(title_text="Subscriber percentage", secondary_y=False)
fig.update_yaxes(title_text="Number of samples", secondary_y=True)
fig
def func(x):
if x >= 18 and x<=30:
return '18_30'
elif x > 30 and x<=40:
return '30_40'
elif x > 40 and x<=50:
return '40_50'
elif x > 50 and x<=60:
return '50_60'
else:
return 'over60'
df['age'] = df['age'].apply(func)
Job¶
print(df['job'].unique())
['management' 'technician' 'entrepreneur' 'blue-collar' 'unknown' 'retired' 'admin' 'services' 'self-employed' 'unemployed' 'housemaid' 'student']
print(df['job'].value_counts())
job blue-collar 9383 management 8166 technician 6852 admin 4483 services 3910 retired 1437 self-employed 1414 entrepreneur 1405 unemployed 1104 housemaid 1087 student 524 unknown 235 Name: count, dtype: int64
label = ['management','technician','entrepreneur','blue-collar','unknown','retired','admin','services','self-employed','unemployed','housemaid','student']
num_samp_job = []
perc_job = []
for l in label:
num_samp_job.append(df[df.job==l].shape[0])
for i,l in enumerate(label):
perc_job.append(round((df[(df.job==l) & (df['y'] == 1)]).shape[0]/num_samp_job[i]*100,2))
# Create Plot
fig = make_subplots(specs=[[{"secondary_y": True}]])
# Add traces
fig.add_trace(
go.Scatter(x=label, y=perc_job, name="Subscriber percentage"),
secondary_y=False,
)
fig.add_trace(
go.Bar(x=label, y=num_samp_job, name="Number of samples",width=0.1),
secondary_y=True,
)
# Add figure title
fig.update_layout(
title_text="Job feature"
)
# Set x-axis title
fig.update_xaxes(title_text="Type of job")
# Set y-axes titles
fig.update_yaxes(title_text="Subscriber percentage", secondary_y=False)
fig.update_yaxes(title_text="Number of samples", secondary_y=True)
fig
Student and retired seems to have the larger number of subscriber. Given the number of categories and samples it would be better to merge categories together. The categories unemployed, housemaid, student, unknown, retired and self-employed will be grouped together in the category 'Not full-time job'. The categories admin, technician, services will be grouped together in the category 'Office job' and the categories entrepreneur, management, blue-collar will be grouped together in the category 'High profile job'.
d = {'technician':'office_job','admin':'office_job','services':'office_job','unemployed':'not_full_time_job','housemaid':'not_full_time_job','student':'not_full_time_job','unknown':'not_full_time_job','retired':'not_full_time_job','self-employed':'not_full_time_job','entrepreneur':'high_profile_job','management':'high_profile_job','blue-collar':'high_profile_job'}
df['job'] = df['job'].replace(d)
df_sub['job'] = df_sub['job'].replace(d)
df_not_sub['job'] = df_not_sub['job'].replace(d)
C:\Users\PicardiC\AppData\Local\Temp\ipykernel_9684\2141811894.py:3: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy C:\Users\PicardiC\AppData\Local\Temp\ipykernel_9684\2141811894.py:4: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
print(df['job'].unique())
print(df_sub['job'].unique())
print(df_not_sub['job'].unique())
['high_profile_job' 'office_job' 'not_full_time_job'] ['office_job' 'high_profile_job' 'not_full_time_job'] ['high_profile_job' 'office_job' 'not_full_time_job']
print(df['job'].value_counts())
job high_profile_job 18954 office_job 15245 not_full_time_job 5801 Name: count, dtype: int64
label = ['high_profile_job','office_job','not_full_time_job' ]
num_samp_job = []
perc_job = []
for l in label:
num_samp_job.append(df[df.job==l].shape[0])
for i,l in enumerate(label):
perc_job.append(round((df[(df.job==l) & (df['y'] == 1)]).shape[0]/num_samp_job[i]*100,2))
# Create Plot
fig = make_subplots(specs=[[{"secondary_y": True}]])
# Add traces
fig.add_trace(
go.Scatter(x=label, y=perc_job, name="Subscriber percentage"),
secondary_y=False,
)
fig.add_trace(
go.Bar(x=label, y=num_samp_job, name="Number of samples",width=0.1),
secondary_y=True,
)
# Add figure title
fig.update_layout(
title_text="Job feature"
)
# Set x-axis title
fig.update_xaxes(title_text="Type of job")
# Set y-axes titles
fig.update_yaxes(title_text="Subscriber percentage", secondary_y=False)
fig.update_yaxes(title_text="Number of samples", secondary_y=True)
fig
Unexepectly the percentage of subscribers decrease with position grow, but this can be due to the limited number of sample in not a full-time job category.Also the result is determined by the two categories student and retired which have a larger number of subscriber.
Marital¶
print(df['marital'].unique())
['married' 'single' 'divorced']
print(df['marital'].value_counts())
marital married 24386 single 10889 divorced 4725 Name: count, dtype: int64
label = ['married','single', 'divorced']
num_samp_mar = []
perc_mar = []
for l in label:
num_samp_mar.append(df[df.marital==l].shape[0])
for i,l in enumerate(label):
perc_mar.append(round((df[(df.marital==l) & (df['y'] == 1)]).shape[0]/num_samp_mar[i]*100,2))
# Create Plot
fig = make_subplots(specs=[[{"secondary_y": True}]])
# Add traces
fig.add_trace(
go.Scatter(x=label, y=perc_mar, name="Subscriber percentage"),
secondary_y=False,
)
fig.add_trace(
go.Bar(x=label, y=num_samp_mar, name="Number of samples",width=0.1),
secondary_y=True,
)
# Add figure title
fig.update_layout(
title_text="Marital feature"
)
# Set x-axis title
fig.update_xaxes(title_text="Marital status")
# Set y-axes titles
fig.update_yaxes(title_text="Subscriber percentage", secondary_y=False)
fig.update_yaxes(title_text="Number of samples", secondary_y=True)
fig
The greatest subscriber percentage is in the single people. The reason of this could be related to less expenses or just related to sample size.
Education¶
print(df['education'].unique())
['tertiary' 'secondary' 'unknown' 'primary']
label = ['primary', 'secondary', 'tertiary', 'unknown']
num_samp_ed = []
perc_ed = []
for l in label:
num_samp_ed.append(df[df.education==l].shape[0])
for i,l in enumerate(label):
perc_ed.append(round((df[(df.education==l) & (df['y'] == 1)]).shape[0]/num_samp_ed[i]*100,2))
# Create Plot
fig = make_subplots(specs=[[{"secondary_y": True}]])
# Add traces
fig.add_trace(
go.Scatter(x=label, y=perc_ed, name="Subscriber percentage"),
secondary_y=False,
)
fig.add_trace(
go.Bar(x=label, y=num_samp_ed, name="Number of samples",width=0.1),
secondary_y=True,
)
# Add figure title
fig.update_layout(
title_text="Education feature"
)
# Set x-axis title
fig.update_xaxes(title_text="Type of education")
# Set y-axes titles
fig.update_yaxes(title_text="Subscriber percentage", secondary_y=False)
fig.update_yaxes(title_text="Number of samples", secondary_y=True)
fig
The percentage of subscriber increases with the level of education. This feature seems to affect the target.
Default¶
print(df['default'].unique())
['no' 'yes']
print(df['default'].value_counts())
default no 39191 yes 809 Name: count, dtype: int64
label = ['no', 'yes']
num_samp_def = []
perc_def = []
for l in label:
num_samp_def.append((df[df.default==l]).shape[0])
for i,l in enumerate(label):
perc_def.append(round((df[(df.default==l) & (df['y'] == 1)]).shape[0]/num_samp_def[i]*100,2))
# Create Plot
fig = make_subplots(specs=[[{"secondary_y": True}]])
# Add traces
fig.add_trace(
go.Scatter(x=label, y=perc_def, name="Subscriber percentage"),
secondary_y=False,
)
fig.add_trace(
go.Bar(x=label, y=num_samp_def, name="Number of samples",width=0.1),
secondary_y=True,
)
# Add figure title
fig.update_layout(
title_text="Default feature"
)
# Set x-axis title
fig.update_xaxes(title_text="Account in default")
# Set y-axes titles
fig.update_yaxes(title_text="Subscriber percentage", secondary_y=False)
fig.update_yaxes(title_text="Number of samples", secondary_y=True)
fig
df['default']=pd.factorize(df['default'])[0]
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 40000 entries, 0 to 39999 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 age 40000 non-null object 1 job 40000 non-null object 2 marital 40000 non-null object 3 education 40000 non-null object 4 default 40000 non-null int64 5 balance 40000 non-null int64 6 housing 40000 non-null object 7 loan 40000 non-null object 8 contact 40000 non-null object 9 day 40000 non-null int64 10 month 40000 non-null object 11 duration 40000 non-null int64 12 campaign 40000 non-null int64 13 y 40000 non-null int64 dtypes: int64(6), object(8) memory usage: 4.3+ MB
print(df['default'].value_counts())
default 0 39191 1 809 Name: count, dtype: int64
As expected the majority of subscriber doesn't have a default on the account.
Balance¶
label = ['-2000, 0','0, 2000','2000, 4000','4000, 6000','6000, 8000','8000, 10000', '>10000']
num_samp_bal = [(df[(-2000 <= df['balance']) & (df['balance'] <= 0)]).shape[0],(df[(0 < df['balance']) & (df['balance'] <= 2000)]).shape[0],(df[(2000 < df['balance']) & (df['balance'] <= 4000)]).shape[0],(df[(4000 < df['balance']) & (df['balance'] <= 6000)]).shape[0],(df[(6000 < df['balance']) & (df['balance'] <= 8000)]).shape[0],(df[(8000 < df['balance']) & (df['balance'] <= 10000)]).shape[0],(df[(df['balance'] > 10000)].shape[0])]
perc_bal = [round((df_sub[(-2000 <= df_sub['balance']) & (df_sub['balance'] <= 0)]).shape[0]/num_samp_bal[0]*100,2),round((df_sub[(0 < df_sub['balance']) & (df_sub['balance'] <= 2000)]).shape[0]/num_samp_bal[1]*100,2),round((df_sub[(2000 < df_sub['balance']) & (df_sub['balance'] <= 4000)]).shape[0]/num_samp_bal[2]*100,2),round((df_sub[(4000 < df_sub['balance']) & (df_sub['balance'] <= 6000)]).shape[0]/num_samp_bal[3]*100,2),round((df_sub[(6000 < df_sub['balance']) & (df_sub['balance'] <= 8000)]).shape[0]/num_samp_bal[4]*100,2),round((df_sub[(8000 < df_sub['balance']) & (df_sub['balance'] <= 10000)]).shape[0]/num_samp_bal[5]*100,2),round((df_sub[(df_sub['balance'] > 10000)]).shape[0]/num_samp_bal[6]*100,2)]
# Create Plot
fig = make_subplots(specs=[[{"secondary_y": True}]])
# Add traces
fig.add_trace(
go.Scatter(x=label, y=perc_bal, name="Subscriber percentage"),
secondary_y=False,
)
fig.add_trace(
go.Bar(x=label, y=num_samp_bal, name="Number of samples",width=0.1),
secondary_y=True,
)
# Add figure title
fig.update_layout(
title_text="Balance feature"
)
# Set x-axis title
fig.update_xaxes(title_text="Balance")
# Set y-axes titles
fig.update_yaxes(title_text="Subscriber percentage", secondary_y=False)
fig.update_yaxes(title_text="Number of samples", secondary_y=True)
fig
The percentage of subscriber increases in the range 2000-4000 and 4000-6000, then decreases again for the range 6000-8000 and 8000-10000 finally increasing in balance>10000. So it seems that subscriber are more in people with medium and high-balance. Considering after 6k we have only few samples let's do a different grouping.
label = ['-2000,0','0-2000','2000-6000','>6000']
num_samp_bal = [(df[(-2000 <= df['balance']) & (df['balance'] <= 0)]).shape[0],(df[(0 < df['balance']) & (df['balance'] <= 2000)]).shape[0],(df[(2000 < df['balance']) & (df['balance'] <= 6000)]).shape[0],(df[(df['balance'] > 6000)]).shape[0]]
perc_bal = [round((df_sub[(-2000 <= df_sub['balance']) & (df_sub['balance'] <= 0)]).shape[0]/num_samp_bal[0]*100,2),round((df_sub[(0 < df_sub['balance']) & (df_sub['balance'] <= 2000)]).shape[0]/num_samp_bal[1]*100,2),round((df_sub[(2000 < df_sub['balance']) & (df_sub['balance'] <= 6000)]).shape[0]/num_samp_bal[2]*100,2),round((df_sub[(df_sub['balance'] > 6000)]).shape[0]/num_samp_bal[3]*100,2)]
# Create Plot
fig = make_subplots(specs=[[{"secondary_y": True}]])
# Add traces
fig.add_trace(
go.Scatter(x=label, y=perc_bal, name="Subscriber percentage"),
secondary_y=False,
)
fig.add_trace(
go.Bar(x=label, y=num_samp_bal, name="Number of samples",width=0.1),
secondary_y=True,
)
# Add figure title
fig.update_layout(
title_text="Balance feature"
)
# Set x-axis title
fig.update_xaxes(title_text="Balance")
# Set y-axes titles
fig.update_yaxes(title_text="Subscriber percentage", secondary_y=False)
fig.update_yaxes(title_text="Number of samples", secondary_y=True)
fig
def func(x):
if x >= 0 and x<=-2000:
return 'very_low_balance'
if x > 0 and x<=2000:
return 'medium_balance'
elif x > 2000 and x<=6000:
return 'high_balance'
else:
return 'very_high_balance'
df['balance'] = df['balance'].apply(func)
Housing¶
print(df['housing'].unique())
['yes' 'no']
print(df['housing'].value_counts())
housing yes 24031 no 15969 Name: count, dtype: int64
label = ['yes','no']
num_samp_hou = []
perc_hou = []
for l in label:
num_samp_hou.append((df[df.housing==l]).shape[0])
for i,l in enumerate(label):
perc_hou.append(round((df[(df.housing==l) & (df['y'] == 1)]).shape[0]/num_samp_hou[i]*100,2))
# Create Plot
fig = make_subplots(specs=[[{"secondary_y": True}]])
# Add traces
fig.add_trace(
go.Scatter(x=label, y=perc_hou, name="Subscriber percentage"),
secondary_y=False,
)
fig.add_trace(
go.Bar(x=label, y=num_samp_hou, name="Number of samples",width=0.1),
secondary_y=True,
)
# Add figure title
fig.update_layout(
title_text="Housing feature"
)
# Set x-axis title
fig.update_xaxes(title_text="Housing loan")
# Set y-axes titles
fig.update_yaxes(title_text="Subscriber percentage", secondary_y=False)
fig.update_yaxes(title_text="Number of samples", secondary_y=True)
fig
As expected the percentage of subscriber is greater in people without house loan
df['housing']=pd.factorize(df['housing'])[0]
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 40000 entries, 0 to 39999 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 age 40000 non-null object 1 job 40000 non-null object 2 marital 40000 non-null object 3 education 40000 non-null object 4 default 40000 non-null int64 5 balance 40000 non-null object 6 housing 40000 non-null int64 7 loan 40000 non-null object 8 contact 40000 non-null object 9 day 40000 non-null int64 10 month 40000 non-null object 11 duration 40000 non-null int64 12 campaign 40000 non-null int64 13 y 40000 non-null int64 dtypes: int64(6), object(8) memory usage: 4.3+ MB
print(df['housing'].value_counts())
housing 0 24031 1 15969 Name: count, dtype: int64
Loan¶
print(df['loan'].unique())
['no' 'yes']
print(df['loan'].value_counts())
loan no 33070 yes 6930 Name: count, dtype: int64
label = ['yes','no']
num_samp_loan = []
perc_loan = []
for l in label:
num_samp_loan.append((df[df.loan==l]).shape[0])
for i,l in enumerate(label):
perc_loan.append(round((df[(df.loan==l) & (df['y'] == 1)]).shape[0]/num_samp_loan[i]*100,2))
# Create Plot
fig = make_subplots(specs=[[{"secondary_y": True}]])
# Add traces
fig.add_trace(
go.Scatter(x=label, y=perc_loan, name="Subscriber percentage"),
secondary_y=False,
)
fig.add_trace(
go.Bar(x=label, y=num_samp_loan, name="Number of samples",width=0.1),
secondary_y=True,
)
# Add figure title
fig.update_layout(
title_text= "Loan feature"
)
# Set x-axis title
fig.update_xaxes(title_text="Personal loan")
# Set y-axes titles
fig.update_yaxes(title_text="Subscriber percentage", secondary_y=False)
fig.update_yaxes(title_text="Number of samples", secondary_y=True)
fig
As expected the percentage of subscriber is greater in people without a personal loan.
df['loan']=pd.factorize(df['loan'])[0]
print(df['loan'].value_counts())
loan 0 33070 1 6930 Name: count, dtype: int64
Contact¶
print(df['contact'].unique())
['unknown' 'cellular' 'telephone']
print(df['contact'].value_counts())
contact cellular 24914 unknown 12765 telephone 2321 Name: count, dtype: int64
label = ['cellular', 'telephone', 'unknown']
num_samp_con = []
perc_con = []
for l in label:
num_samp_con.append((df[df.contact==l]).shape[0])
for i,l in enumerate(label):
perc_con.append(round((df[(df.contact==l) & (df['y'] == 1)]).shape[0]/num_samp_con[i]*100,2))
# Create Plot
fig = make_subplots(specs=[[{"secondary_y": True}]])
# Add traces
fig.add_trace(
go.Scatter(x=label, y=perc_con, name="Subscriber percentage"),
secondary_y=False,
)
fig.add_trace(
go.Bar(x=label, y=num_samp_con, name="Number of samples",width=0.1),
secondary_y=True,
)
# Add figure title
fig.update_layout(
title_text="Contact feature"
)
# Set x-axis title
fig.update_xaxes(title_text="Mean of contact")
# Set y-axes titles
fig.update_yaxes(title_text="Subscriber percentage", secondary_y=False)
fig.update_yaxes(title_text="Number of samples", secondary_y=True)
fig
People contacted with cellular seem to have the greatest number of subscriber, but it could just be related to the number of samples. Let's drop this feature.
Day¶
label = list(range(1,32))
num_samp_day = []
perc_day = []
for l in label:
num_samp_day.append((df[df.day==l]).shape[0])
for i,l in enumerate(label):
perc_day.append(round((df[(df.day==l) & (df['y'] == 1)]).shape[0]/num_samp_day[i]*100,2))
# Create Plot
fig = make_subplots(specs=[[{"secondary_y": True}]])
# Add traces
fig.add_trace(
go.Scatter(x=label, y=perc_day, name="Subscriber percentage"),
secondary_y=False,
)
fig.add_trace(
go.Bar(x=label, y=num_samp_day, name="Number of samples",width=0.1),
secondary_y=True,
)
# Add figure title
fig.update_layout(
title_text="Day feature"
)
# Set x-axis title
fig.update_xaxes(title_text="Day of contact")
# Set y-axes titles
fig.update_yaxes(title_text="Subscriber percentage", secondary_y=False)
fig.update_yaxes(title_text="Number of samples", secondary_y=True)
fig
The percentage of subscriber seems to increase the 13th day and 22nd.
Month¶
df['month'].unique()
array(['may', 'jun', 'jul', 'aug', 'oct', 'nov', 'dec', 'jan', 'feb',
'mar', 'apr'], dtype=object)
label = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'oct', 'nov', 'dec']
num_samp_mon = []
perc_mon = []
for l in label:
num_samp_mon.append((df[df.month==l]).shape[0])
for i,l in enumerate(label):
perc_mon.append(round((df[(df.month==l) & (df['y'] == 1)]).shape[0]/num_samp_mon[i]*100,2))
# Create Plot
fig = make_subplots(specs=[[{"secondary_y": True}]])
# Add traces
fig.add_trace(
go.Scatter(x=label, y=perc_mon, name="Subscriber percentage"),
secondary_y=False,
)
fig.add_trace(
go.Bar(x=label, y=num_samp_mon, name="Number of samples",width=0.1),
secondary_y=True,
)
# Add figure title
fig.update_layout(
title_text="Month feature"
)
# Set x-axis title
fig.update_xaxes(title_text="Month of last contact")
# Set y-axes titles
fig.update_yaxes(title_text="Subscriber percentage", secondary_y=False)
fig.update_yaxes(title_text="Number of samples", secondary_y=True)
fig
October and march have the gratest number of subscriber but this could be due just to the sample size, so this variable can be dropped.
Duration¶
label = ['0-60','60-120','120-180','180-240','240-300','300-360','360-420','420-480','480-540','540-600','600-660','660-720','720-780','780-840','840-900','900-960','>960']
num_samp_dur = [(df[(0 <= df['duration']) & (df['duration'] <= 60)]).shape[0],(df[(60 < df['duration']) & (df['duration'] <= 120)]).shape[0],(df[(120 < df['duration']) & (df['duration'] <= 180)]).shape[0],(df[(180 < df['duration']) & (df['duration'] <= 240)]).shape[0],(df[(240 < df['duration']) & (df['duration'] <= 300)]).shape[0],(df[(300 < df['duration']) & (df['duration'] <= 360)]).shape[0],(df[(360 < df['duration']) & (df['duration'] <= 420)]).shape[0],(df[(420 < df['duration']) & (df['duration'] <= 480)]).shape[0],(df[(480 < df['duration']) & (df['duration'] <= 540)]).shape[0],(df[(540 < df['duration']) & (df['duration'] <= 600)]).shape[0],(df[(600 < df['duration']) & (df['duration'] <= 660)]).shape[0],(df[(660 < df['duration']) & (df['duration'] <= 720)]).shape[0],(df[(720 < df['duration']) & (df['duration'] <= 780)]).shape[0],(df[(780 < df['duration']) & (df['duration'] <= 840)]).shape[0],(df[(840 < df['duration']) & (df['duration'] <= 900)]).shape[0],(df[(900 < df['duration']) & (df['duration'] <= 960)]).shape[0],(df[(df['duration'] > 960)].shape[0])]
perc_dur = [round((df_sub[(0 <= df_sub['duration']) & (df_sub['duration'] <= 60)]).shape[0]/num_samp_dur[0]*100,2),round((df_sub[(60 < df_sub['duration']) & (df_sub['duration'] <= 120)]).shape[0]/num_samp_dur[1]*100,2),round((df_sub[(120 < df_sub['duration']) & (df_sub['duration'] <= 180)]).shape[0]/num_samp_dur[2]*100,2),round((df_sub[(180 < df_sub['duration']) & (df_sub['duration'] <= 240)]).shape[0]/num_samp_dur[3]*100,2),round((df_sub[(240 < df_sub['duration']) & (df_sub['duration'] <= 300)]).shape[0]/num_samp_dur[4]*100,2),round((df_sub[(300 < df_sub['duration']) & (df_sub['duration'] <= 360)]).shape[0]/num_samp_dur[5]*100,2),round((df_sub[(360 < df_sub['duration']) & (df_sub['duration'] <= 420)]).shape[0]/num_samp_dur[6]*100,2),round((df_sub[(420 < df_sub['duration']) & (df_sub['duration'] <= 480)]).shape[0]/num_samp_dur[7]*100,2),round((df_sub[(480 < df_sub['duration']) & (df_sub['duration'] <= 540)]).shape[0]/num_samp_dur[8]*100,2),round((df_sub[(540 < df_sub['duration']) & (df_sub['duration'] <= 600)]).shape[0]/num_samp_dur[9]*100,2),round((df_sub[(600 < df_sub['duration']) & (df_sub['duration'] <= 660)]).shape[0]/num_samp_dur[10]*100,2),round((df_sub[(660 < df_sub['duration']) & (df_sub['duration'] <= 720)]).shape[0]/num_samp_dur[11]*100,2),round((df_sub[(720 < df_sub['duration']) & (df_sub['duration'] <= 780)]).shape[0]/num_samp_dur[12]*100,2),round((df_sub[(780 < df_sub['duration']) & (df_sub['duration'] <= 840)]).shape[0]/num_samp_dur[13]*100,2),round((df_sub[(840 < df_sub['duration']) & (df_sub['duration'] <= 900)]).shape[0]/num_samp_dur[14]*100,2),round((df_sub[(900 < df_sub['duration']) & (df_sub['duration'] <= 960)]).shape[0]/num_samp_dur[15]*100,2),round((df_sub[(df_sub['duration'] > 960)]).shape[0]/num_samp_dur[16]*100,2)]
# Create Plot
fig = make_subplots(specs=[[{"secondary_y": True}]])
# Add traces
fig.add_trace(
go.Scatter(x=label, y=perc_dur, name="Subscriber percentage"),
secondary_y=False,
)
fig.add_trace(
go.Bar(x=label, y=num_samp_dur, name="Number of samples",width=0.1),
secondary_y=True,
)
# Add figure title
fig.update_layout(
title_text="Duration feature"
)
# Set x-axis title
fig.update_xaxes(title_text="Duration of last contact")
# Set y-axes titles
fig.update_yaxes(title_text="Subscriber percentage", secondary_y=False)
fig.update_yaxes(title_text="Number of samples", secondary_y=True)
fig
The percentage of subscriber increase with the duration of the last contact. Let's try a different grouping
label = ['0-180','180-360','>360']
num_samp_duration = [(df[(0 <= df['duration']) & (df['duration'] <= 180)]).shape[0],(df[(180 < df['duration']) & (df['duration'] <= 360)]).shape[0],(df[(df['duration'] > 360)].shape[0])]
perc_duration = [round((df_sub[(0 <= df_sub['duration']) & (df_sub['duration'] <= 180)]).shape[0]/num_samp_duration[0]*100,2),round((df_sub[(180 < df_sub['duration']) & (df_sub['duration'] <= 360)]).shape[0]/num_samp_duration[1]*100,2),round((df_sub[(df_sub['duration'] > 360)]).shape[0]/num_samp_duration[2]*100,2)]
# Create Plot
fig = make_subplots(specs=[[{"secondary_y": True}]])
# Add traces
fig.add_trace(
go.Scatter(x=label, y=perc_duration, name="Subscriber percentage"),
secondary_y=False,
)
fig.add_trace(
go.Bar(x=label, y=num_samp_duration, name="Number of samples",width=0.1),
secondary_y=True,
)
# Add figure title
fig.update_layout(
title_text="Duration feature"
)
# Set x-axis title
fig.update_xaxes(title_text="Duration of last contact")
# Set y-axes titles
fig.update_yaxes(title_text="Subscriber percentage", secondary_y=False)
fig.update_yaxes(title_text="Number of samples", secondary_y=True)
fig
def func(x):
if x >= 0 and x<=180:
return '0_3_minutes'
elif x>180 and x<=360:
return '3_6_minutes'
else:
return 'greater_than_6_minutes'
df['duration'] = df['duration'].apply(func)
Campaign¶
label = ['1-4','4-8','8-12','>12']
num_samp_cam = [(df[(1 <= df['campaign']) & (df['campaign'] <= 4)]).shape[0],(df[(4 < df['campaign']) & (df['campaign'] <= 8)]).shape[0],(df[(8 < df['campaign']) & (df['campaign'] <= 12)]).shape[0],(df[(df['campaign'] > 12)].shape[0])]
perc_cam = [round((df_sub[(1 <= df_sub['campaign']) & (df_sub['campaign'] <= 4)]).shape[0]/num_samp_cam[0]*100,2),round((df_sub[(4 < df_sub['campaign']) & (df_sub['campaign'] <= 8)]).shape[0]/num_samp_cam[1]*100,2),round((df_sub[(8 < df_sub['campaign']) & (df_sub['campaign'] <= 12)]).shape[0]/num_samp_cam[2]*100,2),round((df_sub[(df_sub['campaign'] > 12)]).shape[0]/num_samp_cam[3]*100,2)]
# Create Plot
fig = make_subplots(specs=[[{"secondary_y": True}]])
# Add traces
fig.add_trace(
go.Scatter(x=label, y=perc_cam, name="Subscriber percentage"),
secondary_y=False,
)
fig.add_trace(
go.Bar(x=label, y=num_samp_cam, name="Number of samples",width=0.1),
secondary_y=True,
)
# Add figure title
fig.update_layout(
title_text="Campaign feature"
)
# Set x-axis title
fig.update_xaxes(title_text="Number of contacts for the last campaign")
# Set y-axes titles
fig.update_yaxes(title_text="Subscriber percentage", secondary_y=False)
fig.update_yaxes(title_text="Number of samples", secondary_y=True)
fig
Apparently the number of subscriber decreases with the number of contacts in the campaign. This feature can be dropped.
Dropping feature and converting categorical¶
df = df.drop(columns=['contact','day','month','campaign'])
df.head()
| age | job | marital | education | default | balance | housing | loan | duration | y | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 50_60 | high_profile_job | married | tertiary | 0 | high_balance | 0 | 0 | 3_6_minutes | 0 |
| 1 | 40_50 | office_job | single | secondary | 0 | medium_balance | 0 | 0 | 0_3_minutes | 0 |
| 2 | 30_40 | high_profile_job | married | secondary | 0 | medium_balance | 0 | 1 | 0_3_minutes | 0 |
| 3 | 40_50 | high_profile_job | married | unknown | 0 | medium_balance | 0 | 0 | 0_3_minutes | 0 |
| 4 | 30_40 | not_full_time_job | single | unknown | 0 | medium_balance | 1 | 0 | 3_6_minutes | 0 |
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 40000 entries, 0 to 39999 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 age 40000 non-null object 1 job 40000 non-null object 2 marital 40000 non-null object 3 education 40000 non-null object 4 default 40000 non-null int64 5 balance 40000 non-null object 6 housing 40000 non-null int64 7 loan 40000 non-null int64 8 duration 40000 non-null object 9 y 40000 non-null int64 dtypes: int64(4), object(6) memory usage: 3.1+ MB
#scaler = MinMaxScaler()
#df['age'] = scaler.fit_transform(df['age'].values.reshape(-1,1))
#df['balance'] = scaler.fit_transform(df['balance'].values.reshape(-1,1))
#df['duration'] = scaler.fit_transform(df['duration'].values.reshape(-1,1))
df.describe()
| default | housing | loan | y | |
|---|---|---|---|---|
| count | 40000.000000 | 40000.000000 | 40000.000000 | 40000.000000 |
| mean | 0.020225 | 0.399225 | 0.173250 | 0.072400 |
| std | 0.140771 | 0.489745 | 0.378468 | 0.259152 |
| min | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 50% | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 75% | 0.000000 | 1.000000 | 0.000000 | 0.000000 |
| max | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
df_term = pd.get_dummies(df, columns = ['age','marital','education','duration','job','balance'])
df_term.head()
| default | housing | loan | y | age_18_30 | age_30_40 | age_40_50 | age_50_60 | age_over60 | marital_divorced | ... | education_unknown | duration_0_3_minutes | duration_3_6_minutes | duration_greater_than_6_minutes | job_high_profile_job | job_not_full_time_job | job_office_job | balance_high_balance | balance_medium_balance | balance_very_high_balance | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 0 | False | False | False | True | False | False | ... | False | False | True | False | True | False | False | True | False | False |
| 1 | 0 | 0 | 0 | 0 | False | False | True | False | False | False | ... | False | True | False | False | False | False | True | False | True | False |
| 2 | 0 | 0 | 1 | 0 | False | True | False | False | False | False | ... | False | True | False | False | True | False | False | False | True | False |
| 3 | 0 | 0 | 0 | 0 | False | False | True | False | False | False | ... | True | True | False | False | True | False | False | False | True | False |
| 4 | 0 | 1 | 0 | 0 | False | True | False | False | False | False | ... | True | False | True | False | False | True | False | False | True | False |
5 rows × 25 columns
Prepare data¶
y = df_term['y']
X = df_term.drop(columns = ['y'])
print(X.shape, y.shape)
(40000, 24) (40000,)
X.head()
| default | housing | loan | age_18_30 | age_30_40 | age_40_50 | age_50_60 | age_over60 | marital_divorced | marital_married | ... | education_unknown | duration_0_3_minutes | duration_3_6_minutes | duration_greater_than_6_minutes | job_high_profile_job | job_not_full_time_job | job_office_job | balance_high_balance | balance_medium_balance | balance_very_high_balance | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | False | False | False | True | False | False | True | ... | False | False | True | False | True | False | False | True | False | False |
| 1 | 0 | 0 | 0 | False | False | True | False | False | False | False | ... | False | True | False | False | False | False | True | False | True | False |
| 2 | 0 | 0 | 1 | False | True | False | False | False | False | True | ... | False | True | False | False | True | False | False | False | True | False |
| 3 | 0 | 0 | 0 | False | False | True | False | False | False | True | ... | True | True | False | False | True | False | False | False | True | False |
| 4 | 0 | 1 | 0 | False | True | False | False | False | False | False | ... | True | False | True | False | False | True | False | False | True | False |
5 rows × 24 columns
Train and evaluate different models¶
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1)
print(X_test.shape)
(8000, 24)
counter = Counter(y_train)
# estimate scale_pos_weight value
estimate = counter[0] / counter[1]
print(counter[0])
print(counter[1])
print('Estimate: %.3f' % estimate)
29669 2331 Estimate: 12.728
svm = SVC()
tree = DecisionTreeClassifier()
rf = RandomForestClassifier(max_depth=2, random_state=0)
bst = XGBClassifier(n_estimators=5000, max_depth=5, learning_rate=0.1, objective='binary:logistic',scale_pos_weight=estimate)
# Training the models
svm.fit(X_train, y_train)
tree.fit(X_train, y_train)
rf.fit(X_train, y_train)
bst.fit(X_train, y_train)
# Making predictions with each model
svm_preds = svm.predict(X_test)
tree_preds = tree.predict(X_test)
rf_preds = rf.predict(X_test)
bst_preds = bst.predict(X_test)
model_preds = {
"Support Vector Machine": svm_preds,
"Decision Tree": tree_preds,
"Random forest": rf_preds,
"XGBoost": bst_preds
}
for model, preds in model_preds.items():
print(f"{model} Results:\n{classification_report(y_test, preds)}", sep="\n\n")
Support Vector Machine Results:
precision recall f1-score support
0 0.93 1.00 0.96 7435
1 0.45 0.02 0.03 565
accuracy 0.93 8000
macro avg 0.69 0.51 0.50 8000
weighted avg 0.90 0.93 0.90 8000
Decision Tree Results:
precision recall f1-score support
0 0.93 0.99 0.96 7435
1 0.32 0.07 0.11 565
accuracy 0.92 8000
macro avg 0.63 0.53 0.54 8000
weighted avg 0.89 0.92 0.90 8000
Random forest Results:
precision recall f1-score support
0 0.93 1.00 0.96 7435
1 0.00 0.00 0.00 565
accuracy 0.93 8000
macro avg 0.46 0.50 0.48 8000
weighted avg 0.86 0.93 0.90 8000
XGBoost Results:
precision recall f1-score support
0 0.98 0.80 0.88 7435
1 0.22 0.77 0.35 565
accuracy 0.80 8000
macro avg 0.60 0.78 0.61 8000
weighted avg 0.92 0.80 0.84 8000
C:\Users\PicardiC\AppData\Local\anaconda3\Lib\site-packages\sklearn\metrics\_classification.py:1344: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. C:\Users\PicardiC\AppData\Local\anaconda3\Lib\site-packages\sklearn\metrics\_classification.py:1344: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. C:\Users\PicardiC\AppData\Local\anaconda3\Lib\site-packages\sklearn\metrics\_classification.py:1344: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
XGBoost it is the best model, because also if the accuracy is lower compared to the other models it has a recall 0.77 while the others have very low recall and precision
confusion_matrix(y_test.values, bst_preds)
array([[5930, 1505],
[ 132, 433]], dtype=int64)
tn, fp, fn, tp = confusion_matrix(y_test.values, bst_preds).ravel()
print(f"Number of true positive: {tp}, number of false negative:{fn}, number of true negatives: {tn}, number of false positive {fp}.")
Number of true positive: 433, number of false negative:132, number of true negatives: 5930, number of false positive 1505.
Features importance¶
sorted_idx = bst.feature_importances_.argsort()
list_of_tuples = list(zip(X_test.columns[sorted_idx], bst.feature_importances_[sorted_idx]))
df = pd.DataFrame(list_of_tuples,columns=['Features', 'Importance scores'])
print(df)
Features Importance scores 0 education_unknown 0.006056 1 default 0.006507 2 age_40_50 0.007290 3 marital_divorced 0.007556 4 balance_medium_balance 0.008261 5 job_not_full_time_job 0.008733 6 education_secondary 0.008873 7 age_30_40 0.009568 8 balance_very_high_balance 0.009723 9 job_high_profile_job 0.009845 10 marital_single 0.010031 11 age_50_60 0.011445 12 marital_married 0.011711 13 loan 0.012512 14 job_office_job 0.012652 15 education_primary 0.014126 16 balance_high_balance 0.014973 17 duration_3_6_minutes 0.015843 18 education_tertiary 0.019177 19 age_18_30 0.019609 20 housing 0.020472 21 age_over60 0.036308 22 duration_0_3_minutes 0.064564 23 duration_greater_than_6_minutes 0.654168
# bult-in feature importance
fig = px.bar(x=X_test.columns[sorted_idx], y=bst.feature_importances_[sorted_idx], labels={'x':'Feature', 'y':'Importance scores'})
fig
Diagnostic_graph¶
X_test.info()
<class 'pandas.core.frame.DataFrame'> Index: 8000 entries, 3841 to 32191 Data columns (total 24 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 default 8000 non-null int64 1 housing 8000 non-null int64 2 loan 8000 non-null int64 3 age_18_30 8000 non-null bool 4 age_30_40 8000 non-null bool 5 age_40_50 8000 non-null bool 6 age_50_60 8000 non-null bool 7 age_over60 8000 non-null bool 8 marital_divorced 8000 non-null bool 9 marital_married 8000 non-null bool 10 marital_single 8000 non-null bool 11 education_primary 8000 non-null bool 12 education_secondary 8000 non-null bool 13 education_tertiary 8000 non-null bool 14 education_unknown 8000 non-null bool 15 duration_0_3_minutes 8000 non-null bool 16 duration_3_6_minutes 8000 non-null bool 17 duration_greater_than_6_minutes 8000 non-null bool 18 job_high_profile_job 8000 non-null bool 19 job_not_full_time_job 8000 non-null bool 20 job_office_job 8000 non-null bool 21 balance_high_balance 8000 non-null bool 22 balance_medium_balance 8000 non-null bool 23 balance_very_high_balance 8000 non-null bool dtypes: bool(21), int64(3) memory usage: 414.1 KB
def make_diagnostic_graph(dict_pred, dict_act, values, ylabel):
for ind, el in np.ndenumerate(bst_preds):
dict_pred[values[ind[0]]].append(el)
dict_act[values[ind[0]]].append(y_test.values[ind[0]])
mean_pred =[]
for key in list(dict_pred.keys()):
if len(dict_pred[key]) != 0:
mean_pred.append(sum(dict_pred[key])/len(dict_pred[key]))
else:
mean_pred.append(0)
mean_act =[]
for key in dict_act.keys():
if len(dict_act[key]) != 0:
mean_act.append(sum(dict_act[key])/len(dict_act[key]))
else:
mean_act.append(0)
upper_bound = max(max(mean_pred,mean_act))+0.05
r = [0,upper_bound]
# Create Plot
fig = make_subplots(specs=[[{"secondary_y": True}]])
# Add traces
fig.add_trace(
go.Bar(x=list(dict_pred.keys()), y=mean_pred, name="Avg predicted values",width=0.3),
secondary_y=False,
)
fig.add_trace(
go.Bar(x=list(dict_pred.keys()), y=mean_act, name="Avg actual values",width=0.1),
secondary_y=True,
)
# Add figure title
fig.update_layout(
title_text="Diagnostic diagram"
)
# Set x-axis title
fig.update_xaxes(title_text=ylabel)
# Set y-axes titles
fig.update_yaxes(title_text="Avg predicted values", range=r, secondary_y=False)
fig.update_yaxes(title_text="Avg actual values", range=r, secondary_y=True)
fig.show()
def make_pred_act(dict_pred, dict_act, values):
for ind, el in np.ndenumerate(bst_preds):
dict_pred[values[ind[0]]].append(el)
dict_act[values[ind[0]]].append(y_test.values[ind[0]])
mean_pred =[]
for key in list(dict_pred.keys()):
if len(dict_pred[key]) != 0:
mean_pred.append(sum(dict_pred[key])/len(dict_pred[key]))
else:
mean_pred.append(0)
mean_act =[]
for key in dict_act.keys():
if len(dict_act[key]) != 0:
mean_act.append(sum(dict_act[key])/len(dict_act[key]))
else:
mean_act.append(0)
return mean_pred[1], mean_act[1]
def get_num_samples(vect):
count = 0
for el in vect:
if el==1:
count = count+1
return count
zero_one_dict_pred = {0:[], 1:[]}
zero_one_dict_act = {0:[], 1:[]}
make_diagnostic_graph(zero_one_dict_pred, zero_one_dict_act, X_test.default.values,"Default")
zero_one_dict_pred = {0:[], 1:[]}
zero_one_dict_act = {0:[], 1:[]}
make_diagnostic_graph(zero_one_dict_pred, zero_one_dict_act, X_test.housing.values,"Housing")
zero_one_dict_pred = {0:[], 1:[]}
zero_one_dict_act = {0:[], 1:[]}
make_diagnostic_graph(zero_one_dict_pred, zero_one_dict_act, X_test.loan.values,"Loan")
true_false_dict_pred = {False:[], True:[]}
true_false_dict_act = {False:[], True:[]}
mean_pred_18_30,mean_act_18_30 = make_pred_act(true_false_dict_pred, true_false_dict_act, X_test.age_18_30.values)
true_false_dict_pred = {False:[], True:[]}
true_false_dict_act = {False:[], True:[]}
mean_pred_30_40,mean_act_30_40 = make_pred_act(true_false_dict_pred, true_false_dict_act, X_test.age_30_40.values)
true_false_dict_pred = {False:[], True:[]}
true_false_dict_act = {False:[], True:[]}
mean_pred_40_50,mean_act_40_50 = make_pred_act(true_false_dict_pred, true_false_dict_act, X_test.age_40_50.values)
true_false_dict_pred = {False:[], True:[]}
true_false_dict_act = {False:[], True:[]}
mean_pred_50_60,mean_act_50_60 = make_pred_act(true_false_dict_pred, true_false_dict_act, X_test.age_50_60.values)
true_false_dict_pred = {False:[], True:[]}
true_false_dict_act = {False:[], True:[]}
mean_pred_over60,mean_act_over60 = make_pred_act(true_false_dict_pred, true_false_dict_act, X_test.age_over60.values)
mean_pred = [mean_pred_18_30, mean_pred_30_40, mean_pred_40_50, mean_pred_50_60, mean_pred_over60]
mean_act = [mean_act_18_30, mean_act_30_40, mean_act_40_50, mean_act_50_60, mean_act_over60]
label =['18-30','30-40','40-50','50-60','over60']
num_samples = [get_num_samples(X_test.age_18_30.values),get_num_samples(X_test.age_30_40.values),get_num_samples(X_test.age_40_50.values),get_num_samples(X_test.age_50_60.values),get_num_samples(X_test.age_over60.values)]
# Create Plot
fig = make_subplots(specs=[[{"secondary_y": True}]])
# Add traces
fig.add_trace(
go.Scatter(x=label, y=mean_pred, name="Avg predicted values"),
secondary_y=False,
)
fig.add_trace(
go.Scatter(x=label, y=mean_act, name="Avg actual values"),
secondary_y=False,
)
fig.add_trace(
go.Bar(x=label, y=num_samples, name="num samples",width=0.1),
secondary_y=True,
)
# Add figure title
fig.update_layout(
title_text="Diagnostic diagram"
)
# Set x-axis title
fig.update_xaxes(title_text='Age')
# Set y-axes titles
fig.update_yaxes(title_text="Avg predicted/actual values", secondary_y=False)
fig.update_yaxes(title_text="num_samples", secondary_y=True)
fig.show()
true_false_dict_pred = {False:[], True:[]}
true_false_dict_act = {False:[], True:[]}
mean_pred_div,mean_act_div = make_pred_act(true_false_dict_pred, true_false_dict_act, X_test.marital_divorced.values)
true_false_dict_pred = {False:[], True:[]}
true_false_dict_act = {False:[], True:[]}
mean_pred_mar,mean_act_mar = make_pred_act(true_false_dict_pred, true_false_dict_act, X_test.marital_married.values)
true_false_dict_pred = {False:[], True:[]}
true_false_dict_act
act_sin = make_pred_act(true_false_dict_pred, true_false_dict_act, X_test.marital_single.values)
mean_pred = [mean_pred_div, mean_pred_mar, mean_pred_sin]
mean_act = [mean_act_div, mean_act_mar, mean_act_sin]
label =['Divorced','Married','Single']
num_samples = [get_num_samples(X_test.marital_divorced.values),get_num_samples(X_test.marital_married.values),get_num_samples(X_test.marital_single.values)]
# Create Plot
fig = make_subplots(specs=[[{"secondary_y": True}]])
# Add traces
fig.add_trace(
go.Scatter(x=label, y=mean_pred, name="Avg predicted values"),
secondary_y=False,
)
fig.add_trace(
go.Scatter(x=label, y=mean_act, name="Avg actual values"),
secondary_y=False,
)
fig.add_trace(
go.Bar(x=label, y=num_samples, name="num samples",width=0.1),
secondary_y=True,
)
# Add figure title
fig.update_layout(
title_text="Diagnostic diagram"
)
# Set x-axis title
fig.update_xaxes(title_text='Marital status')
# Set y-axes titles
fig.update_yaxes(title_text="Avg predicted/actual values", secondary_y=False)
fig.update_yaxes(title_text="num_samples", secondary_y=True)
fig.show()
true_false_dict_pred = {False:[], True:[]}
true_false_dict_act = {False:[], True:[]}
mean_pred_1,mean_act_1 = make_pred_act(true_false_dict_pred, true_false_dict_act, X_test.education_primary.values)
true_false_dict_pred = {False:[], True:[]}
true_false_dict_act = {False:[], True:[]}
mean_pred_2,mean_act_2 = make_pred_act(true_false_dict_pred, true_false_dict_act, X_test.education_secondary.values)
true_false_dict_pred = {False:[], True:[]}
true_false_dict_act = {False:[], True:[]}
mean_pred_3,mean_act_3 = make_pred_act(true_false_dict_pred, true_false_dict_act, X_test.education_tertiary.values)
true_false_dict_pred = {False:[], True:[]}
true_false_dict_act = {False:[], True:[]}
mean_pred_4,mean_act_4 = make_pred_act(true_false_dict_pred, true_false_dict_act, X_test.education_unknown.values)
mean_pred = [mean_pred_1, mean_pred_2, mean_pred_3, mean_pred_4]
mean_act = [mean_act_1, mean_act_2, mean_act_3, mean_act_4]
label =['Primary','Secondary','Tertiary','Unknown']
num_samples = [get_num_samples(X_test.education_primary.values),get_num_samples(X_test.education_secondary.values),get_num_samples(X_test.education_tertiary.values),get_num_samples(X_test.education_unknown.values)]
# Create Plot
fig = make_subplots(specs=[[{"secondary_y": True}]])
# Add traces
fig.add_trace(
go.Scatter(x=label, y=mean_pred, name="Avg predicted values"),
secondary_y=False,
)
fig.add_trace(
go.Scatter(x=label, y=mean_act, name="Avg actual values"),
secondary_y=False,
)
fig.add_trace(
go.Bar(x=label, y=num_samples, name="num samples",width=0.1),
secondary_y=True,
)
# Add figure title
fig.update_layout(
title_text="Diagnostic diagram"
)
# Set x-axis title
fig.update_xaxes(title_text='Education')
# Set y-axes titles
fig.update_yaxes(title_text="Avg predicted/actual values", secondary_y=False)
fig.update_yaxes(title_text="num_samples", secondary_y=True)
fig.show()
true_false_dict_pred = {False:[], True:[]}
true_false_dict_act = {False:[], True:[]}
mean_pred_1,mean_act_1 = make_pred_act(true_false_dict_pred, true_false_dict_act, X_test.duration_0_3_minutes.values)
true_false_dict_pred = {False:[], True:[]}
true_false_dict_act = {False:[], True:[]}
mean_pred_2,mean_act_2 = make_pred_act(true_false_dict_pred, true_false_dict_act, X_test.duration_3_6_minutes.values)
true_false_dict_pred = {False:[], True:[]}
true_false_dict_act = {False:[], True:[]}
mean_pred_3,mean_act_3 = make_pred_act(true_false_dict_pred, true_false_dict_act, X_test.duration_greater_than_6_minutes.values)
mean_pred = [mean_pred_1, mean_pred_2, mean_pred_3]
mean_act = [mean_act_1, mean_act_2, mean_act_3]
label =['0-3 minutes','3-6 minutes','greater than 6 minutes']
num_samples = [get_num_samples(X_test.duration_0_3_minutes.values),get_num_samples(X_test.duration_3_6_minutes.values),get_num_samples(X_test.duration_greater_than_6_minutes.values)]
# Create Plot
fig = make_subplots(specs=[[{"secondary_y": True}]])
# Add traces
fig.add_trace(
go.Scatter(x=label, y=mean_pred, name="Avg predicted values"),
secondary_y=False,
)
fig.add_trace(
go.Scatter(x=label, y=mean_act, name="Avg actual values"),
secondary_y=False,
)
fig.add_trace(
go.Bar(x=label, y=num_samples, name="num samples",width=0.1),
secondary_y=True,
)
# Add figure title
fig.update_layout(
title_text="Diagnostic diagram"
)
# Set x-axis title
fig.update_xaxes(title_text='Duration')
# Set y-axes titles
fig.update_yaxes(title_text="Avg predicted/actual values", secondary_y=False)
fig.update_yaxes(title_text="num_samples", secondary_y=True)
fig.show()
true_false_dict_pred = {False:[], True:[]}
true_false_dict_act = {False:[], True:[]}
mean_pred_1,mean_act_1 = make_pred_act(true_false_dict_pred, true_false_dict_act, X_test.job_high_profile_job.values)
true_false_dict_pred = {False:[], True:[]}
true_false_dict_act = {False:[], True:[]}
mean_pred_2,mean_act_2 = make_pred_act(true_false_dict_pred, true_false_dict_act, X_test.job_not_full_time_job.values)
true_false_dict_pred = {False:[], True:[]}
true_false_dict_act = {False:[], True:[]}
mean_pred_3,mean_act_3 = make_pred_act(true_false_dict_pred, true_false_dict_act, X_test.job_office_job.values)
mean_pred = [mean_pred_1, mean_pred_2, mean_pred_3]
mean_act = [mean_act_1, mean_act_2, mean_act_3]
label =['High-profile','Not full-time','Office']
num_samples = [get_num_samples(X_test.job_high_profile_job.values),get_num_samples(X_test.job_not_full_time_job.values),get_num_samples(X_test.job_office_job.values)]
# Create Plot
fig = make_subplots(specs=[[{"secondary_y": True}]])
# Add traces
fig.add_trace(
go.Scatter(x=label, y=mean_pred, name="Avg predicted values"),
secondary_y=False,
)
fig.add_trace(
go.Scatter(x=label, y=mean_act, name="Avg actual values"),
secondary_y=False,
)
fig.add_trace(
go.Bar(x=label, y=num_samples, name="num samples",width=0.1),
secondary_y=True,
)
# Add figure title
fig.update_layout(
title_text="Diagnostic diagram"
)
# Set x-axis title
fig.update_xaxes(title_text='Job')
# Set y-axes titles
fig.update_yaxes(title_text="Avg predicted/actual values", secondary_y=False)
fig.update_yaxes(title_text="num_samples", secondary_y=True)
fig.show()
true_false_dict_pred = {False:[], True:[]}
true_false_dict_act = {False:[], True:[]}
mean_pred_1,mean_act_1 = make_pred_act(true_false_dict_pred, true_false_dict_act, X_test.balance_medium_balance.values)
true_false_dict_pred = {False:[], True:[]}
true_false_dict_act = {False:[], True:[]}
mean_pred_2,mean_act_2 = make_pred_act(true_false_dict_pred, true_false_dict_act, X_test.balance_high_balance.values)
true_false_dict_pred = {False:[], True:[]}
true_false_dict_act = {False:[], True:[]}
mean_pred_3,mean_act_3 = make_pred_act(true_false_dict_pred, true_false_dict_act, X_test.balance_very_high_balance.values)
mean_pred = [mean_pred_1, mean_pred_2, mean_pred_3]
mean_act = [mean_act_1, mean_act_2, mean_act_3]
label =['Medium balance','High balance','Very high-balance']
num_samples = [get_num_samples(X_test.balance_medium_balance.values),get_num_samples(X_test.balance_high_balance.values),get_num_samples(X_test.balance_very_high_balance.values)]
# Create Plot
fig = make_subplots(specs=[[{"secondary_y": True}]])
# Add traces
fig.add_trace(
go.Scatter(x=label, y=mean_pred, name="Avg predicted values"),
secondary_y=False,
)
fig.add_trace(
go.Scatter(x=label, y=mean_act, name="Avg actual values"),
secondary_y=False,
)
fig.add_trace(
go.Bar(x=label, y=num_samples, name="num samples",width=0.1),
secondary_y=True,
)
# Add figure title
fig.update_layout(
title_text="Diagnostic diagram"
)
# Set x-axis title
fig.update_xaxes(title_text='Balance')
# Set y-axes titles
fig.update_yaxes(title_text="Avg predicted/actual values", secondary_y=False)
fig.update_yaxes(title_text="num_samples", secondary_y=True)
fig.show()
Training and evaluate XBoost with cross-validation¶
Data are divided in two sets: data for Cross Validation, which we will call train_test and data for testing the final models, which we will call gtest for global test.
X_train_test, X_gtest, y_train_test, y_gtest = train_test_split(X, y, test_size=0.20, random_state=1)
counter = Counter(y_train_test)
# estimate scale_pos_weight value
estimate = counter[0] / counter[1]
print(counter[0])
print(counter[1])
print('Estimate: %.3f' % estimate)
29669 2331 Estimate: 12.728
Given the previous results, only XGBoost is considered.
bst = XGBClassifier(n_estimators=5000, max_depth=5, learning_rate=0.1, objective='binary:logistic',scale_pos_weight=estimate)
# Cross-validation
cv_results_bst = cross_validate(bst, X_train_test, y_train_test, cv=5, return_estimator=True)
model_preds = {
#"Logistic Regression": scores_lr.mean(),
"XGBoost": cv_results_bst['test_score'].mean()
}
for model, scores in model_preds.items():
print(f"{model} Mean accuracy:\n{scores}", sep="\n\n")
XGBoost Mean accuracy: 0.796125
def global_test_score(cv_results):
gtest_score = []
for i in range(len(cv_results['estimator'])):
gtest_score.append(cv_results['estimator'][i].score(X_gtest, y_gtest))
return gtest_score
model_preds = {
#"Logistic Regression": scores_lr.mean(),
"XGBoost": global_test_score(cv_results_bst)
}
for model, scores in model_preds.items():
print(f"{model} Mean accuracy:\n{sum(scores) / len(scores)}", sep="\n\n")
XGBoost Mean accuracy: 0.7994499999999999
print(cv_results_bst['test_score'])
[0.80640625 0.79796875 0.78625 0.79484375 0.79515625]
scores = global_test_score(cv_results_bst)
print(scores)
[0.8, 0.80025, 0.797375, 0.8, 0.799625]
bst = cv_results_bst['estimator'][1]
bst_preds = bst.predict(X_gtest)
y_gtest.shape
(8000,)
subscriber = []
for x in y_gtest:
if x ==1:
subscriber.append(x)
print(len(subscriber))
565
confusion_matrix(y_gtest.values, bst_preds)
array([[5982, 1453],
[ 145, 420]], dtype=int64)
tn, fp, fn, tp = confusion_matrix(y_test.values, bst_preds).ravel()
print(f"Number of true positive: {tp}, number of false negative:{fn}, number of true negatives: {tn}, number of false positive {fp}.")
Number of true positive: 420, number of false negative:145, number of true negatives: 5982, number of false positive 1453.
print(cv_results_bst['test_score'])
[0.80640625 0.79796875 0.78625 0.79484375 0.79515625]
print(f"Results:\n{classification_report(y_gtest, bst_preds)}", sep="\n\n")
Results:
precision recall f1-score support
0 0.98 0.80 0.88 7435
1 0.22 0.74 0.34 565
accuracy 0.80 8000
macro avg 0.60 0.77 0.61 8000
weighted avg 0.92 0.80 0.84 8000
The model reach an accuracy of 80% with high precision and recall for the negative class. For the positive class we have a very low precision and a recall of 77%. The results for the posive class can be improved collecting more samples.
Conclusion¶
The unbalanced dataset make impossible create an accurate model to correctly identify positive samples. More samples of subscriber should be collected or synthetized. In order to partially balancing the dataset I used the weight matrix of the classifier. From the feature importance analysis on the best model the duration of the call grater than 6 minutes and the age over 60 seems to be key feature in order to predict subscriber. This confirm the result of the data analysis where the retired people and over 60 are more incline to buy and people with negative balance and active loans are less incline to buy. Additionally From the data analysis the number of subscriber seems also to increase with the education grade and the students are more incline to subscribe.